knitr::opts_chunk$set(echo = TRUE)
library(usmap)
library(ggplot2)
library(maps)
library(ggsn)
## Loading required package: grid
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::map() masks maps::map()
This provides insight and analysis into Beer and Brewery data across the U.S. provided by Budweiser Executives for strategic decision making.
#library(tidyverse)
beers = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv"))
## Rows: 2410 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
breweries = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv"))
## Rows: 558 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(beers)
## Name Beer_ID ABV IBU
## Length:2410 Min. : 1.0 Min. :0.00100 Min. : 4.00
## Class :character 1st Qu.: 808.2 1st Qu.:0.05000 1st Qu.: 21.00
## Mode :character Median :1453.5 Median :0.05600 Median : 35.00
## Mean :1431.1 Mean :0.05977 Mean : 42.71
## 3rd Qu.:2075.8 3rd Qu.:0.06700 3rd Qu.: 64.00
## Max. :2692.0 Max. :0.12800 Max. :138.00
## NA's :62 NA's :1005
## Brewery_id Style Ounces
## Min. : 1.0 Length:2410 Min. : 8.40
## 1st Qu.: 94.0 Class :character 1st Qu.:12.00
## Median :206.0 Mode :character Median :12.00
## Mean :232.7 Mean :13.59
## 3rd Qu.:367.0 3rd Qu.:16.00
## Max. :558.0 Max. :32.00
##
summary(breweries)
## Brew_ID Name City State
## Min. : 1.0 Length:558 Length:558 Length:558
## 1st Qu.:140.2 Class :character Class :character Class :character
## Median :279.5 Mode :character Mode :character Mode :character
## Mean :279.5
## 3rd Qu.:418.8
## Max. :558.0
head(beers)
## # A tibble: 6 × 7
## Name Beer_ID ABV IBU Brewery_id Style Ounces
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 Pub Beer 1436 0.05 NA 409 American Pale Lager 12
## 2 Devil's Cup 2265 0.066 NA 178 American Pale Ale (… 12
## 3 Rise of the Phoenix 2264 0.071 NA 178 American IPA 12
## 4 Sinister 2263 0.09 NA 178 American Double / I… 12
## 5 Sex and Candy 2262 0.075 NA 178 American IPA 12
## 6 Black Exodus 2261 0.077 NA 178 Oatmeal Stout 12
head(breweries)
## # A tibble: 6 × 4
## Brew_ID Name City State
## <dbl> <chr> <chr> <chr>
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
#state_data -- Breweries Filtered by State - joined with FIPS code by State
#mutate - Join FIPS data with Breweries data
#trimws - remove Leading/Trailing Whitespace from column(State)
#fips = State by Abbreviation - ie. TX
state_data = breweries %>% count(State) %>% mutate(fips = fips(trimws(State)))
us_map <- usmap::us_map() # used to add map scale
usmap::plot_usmap(data = state_data, values = "n", labels = T)+
labs(fill = 'Breweries by State') +
scale_fill_gradientn(colours=rev(topo.colors(50)),na.value="grey90",
guide = guide_colourbar(barwidth = 25, barheight = 0.6,
#put legend title on top of legend
title.position = "top")) +
# put legend at the bottom, adjust legend title and text font sizes
theme(legend.position = "bottom",
legend.title=element_text(size=12),
legend.text=element_text(size=10))
#Merge beer and breweries data - trim whitespace - add in FIPS codes
full_data = merge(beers,breweries,by.y="Brew_ID",by.x="Brewery_id") %>%
mutate(fips = fips(trimws(State)))
head(full_data)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State fips
## 1 MN 27
## 2 MN 27
## 3 MN 27
## 4 MN 27
## 5 MN 27
## 6 MN 27
tail(full_data)
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State fips
## 2405 CA 06
## 2406 NY 36
## 2407 NY 36
## 2408 NY 36
## 2409 NY 36
## 2410 AK 02
full_data[sample(1:nrow(full_data),6),]
## Brewery_id Name.x Beer_ID ABV IBU
## 1648 323 Good Vibes IPA 1603 0.073 85
## 1623 315 Old Tom Porter 847 0.055 25
## 1625 315 McKinney Eddy Amber Ale 521 0.055 20
## 829 140 Arkansas Red 1650 0.052 NA
## 2209 479 Coffee Oatmeal Stout 311 0.060 54
## 2139 454 Mother Ale 1159 0.056 46
## Style Ounces Name.y
## 1648 American IPA 12 Voodoo Brewery
## 1623 American Porter 16 Piney River Brewing Company
## 1625 American Amber / Red Ale 16 Piney River Brewing Company
## 829 American Amber / Red Ale 12 Core Brewing & Distilling Company
## 2209 Oatmeal Stout 12 Good People Brewing Company
## 2139 American Blonde Ale 12 Denali Brewing Company
## City State fips
## 1648 Meadville PA 42
## 1623 Bucryus MO 29
## 1625 Bucryus MO 29
## 829 Springdale AR 05
## 2209 Birmingham AL 01
## 2139 Talkeetna AK 02
Two Datasets, “Beers.csv” and “Breweries.csv” were joined to create a Full Dataset for analysis and insight
Only ABV and IBU are missing Data Values
Important to note: IBU is missing values in around 42% of the 2,410 total rows of data
**Predictions for IBU specifically, may be skewed
#use colSum to sum up the NA's in each column
full_dataNAs <- as.data.frame(colSums(is.na(full_data)))
names(full_dataNAs) <- c("Missing Values (NAs)")
full_dataNAs
## Missing Values (NAs)
## Brewery_id 0
## Name.x 0
## Beer_ID 0
## ABV 62
## IBU 1005
## Style 5
## Ounces 0
## Name.y 0
## City 0
## State 0
## fips 0
summary(full_data)
## Brewery_id Name.x Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05600
## Mean :232.7 Mean :1431.1 Mean :0.05977
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## NA's :62
## IBU Style Ounces Name.y
## Min. : 4.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.59
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## NA's :1005
## City State fips
## Length:2410 Length:2410 Length:2410
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
1005/2410 #Calculate percentage missing data from IBU Column
## [1] 0.4170124
#Preliminary Steps
#Import Beer and Brewery Data Sets
Beer = read.csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv", header = TRUE)
Breweries = read.csv("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv", header = TRUE)
#Updating the name of the first column of the breweries dataset so there is a matching column between the two datasets
names(Breweries)[1] <- 'Brewery_id'
#Merging the two sets together by the Brewery ID column
BB = merge(Beer,Breweries, by="Brewery_id", all = TRUE)
#Updating the mame of the 8th column so that it is Brewery
names(BB)[8] <- 'Brewery'
#Omitting all N/A values from the ABV and IBU columns
x = na.omit(BB[,c(4,10)])
y = na.omit(BB[,c(5,10)])
Alcohol by Volume Comparison by State
#Finding the median ABV of each state
medians = aggregate(x=x$ABV, by = list(x$State), FUN = median)
#Finding the median IBU of each state
mediansIBU = aggregate(x=y$IBU, by = list(y$State), FUN = median)
#Updating column names of the median ABV dataset
names(medians)[1]<- 'State'
names(medians)[2]<- 'ABV'
#Updating column names of the median IBU dataset
names(mediansIBU)[1]<- 'State'
names(mediansIBU)[2]<- 'IBU'
#Plotting the median ABV of each state
p = medians %>%
ggplot(mapping = aes(x=State, y=ABV)) + geom_bar(stat = 'identity', fill = topo.colors(51), width = .5,color = 'black', show.legend = TRUE) + labs(y= "Median ABV", title = "Median ABV of Beers by State")
ggplotly(p) #Add interactive data labels
Bitterness Comparison by State
#Plotting the median IBU of each state
p = mediansIBU %>%
ggplot(mapping = aes(x=State, y=IBU)) + geom_bar(stat = 'identity', fill = topo.colors(50), width = .5,color = 'black') + labs(y= "Median IBU", title = "Median IBU of Beers by State")
ggplotly(p) #Add interacitve data labels
5a. Which state has the maximum alcoholic (ABV) beer? #Max ABV = CO
#Finding the highest ABV of all the states
MaxABV = aggregate(x$ABV, by = list(x$State),max)
#Finding the highest IBU of all the states
MaxIBU = aggregate(y$IBU, by = list(y$State),max)
#Updating column names of the max IBU dataset
names(MaxIBU)[1]<- 'State'
names(MaxIBU)[2]<- 'IBU'
#Updating column names of the max ABV dataset
names(MaxABV)[1]<- 'State'
names(MaxABV)[2]<- 'ABV'
#Order the MAX ABV and IBU datasets in descending order
MaxABV = BB[order(BB$ABV, decreasing = TRUE),]
MaxIBU = BB[order(BB$IBU, decreasing = TRUE),]
#Plotting the Max ABV of each state
p = MaxABV %>%
ggplot(mapping = aes(x=State, y=ABV)) + geom_bar(stat = "identity", fill = "darkslateblue") + labs(y="ABV", title = "Highest ABVs per State")
ggplotly(p)
## Warning: Removed 62 rows containing missing values (position_stack).
5b. Which state has the most bitter (IBU) beer? #Most Bitter Beer = OR
#Plotting the Max IBU of each state
p = MaxIBU %>%
ggplot(mapping = aes(x=State, y=IBU)) + geom_bar(stat = "identity", fill = "chartreuse3", width = .5,color = 'darkslateblue') + labs(y="IBU", title = "Highest IBUs per State")
ggplotly(p)
## Warning: Removed 1005 rows containing missing values (position_stack).
#Budweiser
Beers = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Beers.csv"))
## Rows: 2410 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Name, Style
## dbl (5): Beer_ID, ABV, IBU, Brewery_id, Ounces
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Breweries = read_csv(url("https://raw.githubusercontent.com/BivinSadler/MSDS_6306_Doing-Data-Science/Master/Unit%208%20and%209%20Case%20Study%201/Breweries.csv"))
## Rows: 558 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Name, City, State
## dbl (1): Brew_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#merge the data before performing EDA
names(Breweries)[1] <- 'Brewery_id'
Breweries
## # A tibble: 558 × 4
## Brewery_id Name City State
## <dbl> <chr> <chr> <chr>
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
## 7 7 Great Divide Brewing Company Denver CO
## 8 8 Tapistry Brewing Bridgman MI
## 9 9 Big Lake Brewing Holland MI
## 10 10 The Mitten Brewing Company Grand Rapids MI
## # … with 548 more rows
Beer_Merge <- merge(x=Beers,y=Breweries, by = "Brewery_id", all.x = TRUE)
lookup = data.frame(abb = state.abb, State = state.name) #makes a data frame with State name and abbreviation.
summary(Beer_Merge$ABV) #see summary statistics of mean, median, mode, and quartlies
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00100 0.05000 0.05600 0.05977 0.06700 0.12800 62
#plot the distribution of ABV
Beer_Merge %>%
ggplot(aes(x = ABV)) + geom_density(binwidth = .5, colour="black",fill="chartreuse3") +
geom_vline(aes(xintercept=mean(ABV, na.rm=T)),
color = "red", linetype = "dashed", size = 1) +
ggtitle("Distribution Density of ABV")
## Warning: Ignoring unknown parameters: binwidth
## Warning: Removed 62 rows containing non-finite values (stat_density).
There appears to a positive correlation between ABV and IBU however, IBU values may be skewed due to missing IBU data - 42% of total missing
Beer_Merge %>%
ggplot(aes(x = ABV, y = IBU, fill = "sample")) +
geom_point() + geom_smooth(method = lm) +
ggtitle("Relationship Between IBU and ABV")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1005 rows containing non-finite values (stat_smooth).
## Warning: Removed 1005 rows containing missing values (geom_point).
correlation_test = cor.test(Beer_Merge$ABV,Beer_Merge$IBU) #run correlation to determine p value and r-sqaured
correlation_test #showcase statistics
##
## Pearson's product-moment correlation
##
## data: Beer_Merge$ABV and Beer_Merge$IBU
## t = 33.863, df = 1403, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6407982 0.6984238
## sample estimates:
## cor
## 0.6706215
#The scatterplot shows a positive correlation between level of bitterness and alcohol by volume indicating that the higher the bitterness the more alcohol that tends to be contatined within
#Even with a sample of 1005 beers out of 2410 observations in total of IBU, we can conclude that a p value of 2.2e-16 we can say,
#we should reject the null hypothesis given an intercept of 0, which is further proved by the confidence interval
#being .64 to .69 at alpha of .05 or 95%.
#There appears to be a strong relationship with a r-squared correlation of .6706 squared = .45.
#It's a fair to state that when the ABV or IBU is increased the other shall also increase.
##Bonus Insights
#Top 2 States _____#CO | #CA - Top Beer Styles for Top 2 States
COStyle = full_data %>% filter(fips == "08") %>% select(State,Style) %>% group_by(State) %>%
count(Style) %>% slice_max(n,n=5) %>% mutate(fips = fips(trimws(State)))
#Colorado Top Beer Styles BarChart
p = COStyle %>%
arrange(desc(COStyle$Style)) %>%
mutate(Style = factor(Style)) %>%
ggplot(aes(y=Style,x=COStyle$n, fill = COStyle$n)) + geom_bar(stat='identity', show.legend = TRUE, width = .8) +
labs(fill = 'CO Beer Styles') +
ggtitle("Top Colorado Beer Styles")
ggplotly(p)
CAStyle = full_data %>% filter(fips == "06") %>% select(State,Style) %>% group_by(State) %>%
count(Style) %>% slice_max(n,n=5) %>% mutate(fips = fips(trimws(State)))
#California Top Beer Styles BarChart
p = CAStyle %>%
arrange(desc(CAStyle$Style)) %>%
mutate(Style = factor(Style)) %>%
ggplot(mapping = aes(y=Style,x=CAStyle$n, fill = CAStyle$n)) + geom_bar(stat='identity', show.legend = TRUE, width = .8) +
labs(fill = 'CA Beer Styles') +
ggtitle("Top California Beer Styles")
ggplotly(p)